This is a draft; the analysis is still ongoing.
This document focuses on exploring the relationship between the census variables.
library(tidyverse)
library(magrittr)
library(knitr)
library(GGally)
Load the transformed census data.
# Read the transformed census table from disk. The column specification
# declares the first 12 columns as character ("c") and the following 125
# columns as double ("d").
census_data_trans <- read_csv(
  file = "../storage/dati-cpa_2011_all-trans-v0_0_3.csv",
  col_types = paste0(strrep("c", 12), strrep("d", 125))
)
Calculate the correlation between the transformed variables to identify those that might be excluded from the analysis.
# Names of the candidate variables: every column positioned from
# P1_norm_log10_std through E30_E31_norm_log10_std (inclusive) in the
# loaded table.
candidate_vars <- colnames(
  select(census_data_trans, P1_norm_log10_std:E30_E31_norm_log10_std)
)
# Kendall rank correlation for every unordered pair of candidate variables.
#
# Result: `candidate_vars_cor`, a tibble with one row per pair and columns
#   var_i, var_j - names of the two variables
#   estimate     - Kendall's tau reported by cor.test()
#   p_value      - p-value reported by cor.test()
#
# Each pair is tested on its own freshly drawn 1% sample of rows, so the
# estimates for different pairs come from different subsets of the data.

# All unordered pairs, in the order (1,2), (1,3), ..., (2,3), ...
# combn() fails when asked for pairs from fewer than two names, so that case
# is mapped to an empty list (nothing to correlate).
candidate_var_pairs <-
  if (length(candidate_vars) >= 2) {
    combn(candidate_vars, 2, simplify = FALSE)
  } else {
    list()
  }

# Preallocate one slot per pair instead of growing a tibble row by row,
# which would copy the accumulated table on every iteration.
candidate_vars_cor_rows <- vector("list", length(candidate_var_pairs))

for (pair_index in seq_along(candidate_var_pairs)) {
  var_i_name <- candidate_var_pairs[[pair_index]][1]
  var_j_name <- candidate_var_pairs[[pair_index]][2]

  census_data_trans_sample <-
    census_data_trans %>%
    slice_sample(prop = 0.01)

  ij_cor_test <- cor.test(
    census_data_trans_sample[[var_i_name]],
    census_data_trans_sample[[var_j_name]],
    method = "kendall"
  )

  candidate_vars_cor_rows[[pair_index]] <- tibble(
    var_i = var_i_name,
    var_j = var_j_name,
    # as.numeric() drops the "tau" name attached to the estimate.
    estimate = as.numeric(ij_cor_test$estimate),
    p_value = as.numeric(ij_cor_test$p.value)
  )
}

# Binding onto a typed zero-row template keeps the four columns (and their
# types) present even when there were no pairs to test.
candidate_vars_cor <- bind_rows(
  tibble(
    var_i = character(),
    var_j = character(),
    estimate = double(),
    p_value = double()
  ),
  candidate_vars_cor_rows
)
Further explore the most highly correlated variables.
# Thresholds used to pick out the strongest correlations: a pair is kept
# when its p-value is below 0.01 and its estimate is above 0.5.
# NOTE(review): only positive estimates can pass this filter; strongly
# negative correlations are not retained - confirm this is intended.
correlations_cutoff_estimate <- 0.5
correlations_cutoff_p_value <- 0.01

# Render the pairs passing both thresholds as a table.
kable(
  filter(
    candidate_vars_cor,
    p_value < correlations_cutoff_p_value,
    estimate > correlations_cutoff_estimate
  )
)
| var_i | var_j | estimate | p_value |
|---|---|---|---|
| P1_norm_log10_std | A44_norm_log10_std | 0.9020632 | 0 |
| P1_norm_log10_std | PF1_norm_log10_std | 0.9315714 | 0 |
| P1_norm_log10_std | E1_norm_log10_std | 0.5668273 | 0 |
| P2_norm_std | P9_norm_std | 0.6052338 | 0 |
| P2_norm_std | P53_norm_std | 0.8484472 | 0 |
| P9_norm_std | P53_norm_std | 0.5329157 | 0 |
| P17_norm_log10_std | P131_norm_log10_std | 0.5514256 | 0 |
| P29_norm_log10_std | P139_norm_std | 0.5288989 | 0 |
| P33_norm_std | P132_norm_std | 0.5292381 | 0 |
| P60_norm_std | P61_norm_std | 0.7663050 | 0 |
| P64_norm_std | P65_norm_std | 0.8096169 | 0 |
| ST1_norm_log10_std | ST2_norm_std | 0.5584053 | 0 |
| ST1_norm_log10_std | ST3_norm_std | 0.5104780 | 0 |
| ST2_norm_std | ST3_norm_std | 0.5243366 | 0 |
| A3_norm_std | A5_A6_A7_norm_std | 0.9730359 | 0 |
| A44_norm_log10_std | PF1_norm_log10_std | 0.9193828 | 0 |
| A44_norm_log10_std | E1_norm_log10_std | 0.5692832 | 0 |
| PF1_norm_log10_std | E1_norm_log10_std | 0.5693603 | 0 |
| PF2_norm_std | PF6_norm_log10_std | 0.5173849 | 0 |
| E20_norm_std | E24_E25_E26_norm_std | 0.6064776 | 0 |
| E20_norm_std | E27_norm_log10_std | 0.5406838 | 0 |
| E24_E25_E26_norm_std | E27_norm_log10_std | 0.7381162 | 0 |
# Distinct variable names appearing on either side of a pair that passes
# both thresholds. Names from var_i come first, followed by names from
# var_j that have not been seen yet.
correlations_to_explore <-
  candidate_vars_cor %>%
  filter(
    p_value < correlations_cutoff_p_value,
    estimate > correlations_cutoff_estimate
  ) %$%
  c(var_i, var_j) %>%
  unique()
# Pairwise plot matrix for the variables selected above, drawn from a 1%
# random sample of rows. The upper panels report Kendall correlations; the
# lower panels are scatter plots with small, semi-transparent points.
correlations_to_explore_panel <-
  census_data_trans %>%
  slice_sample(prop = 0.01) %>%
  # all_of() is the selector for column names held in an external character
  # vector; the embrace operator is meant for function arguments only.
  select(all_of(correlations_to_explore)) %>%
  ggpairs(
    upper = list(continuous = wrap(ggally_cor, method = "kendall")),
    lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1))
  )

print(correlations_to_explore_panel)
# ggsave(
# "../100-prep/111-classification-variable-selection-top-correlations.png",
# correlations_to_explore_panel,
# width = 600,
# height = 600,
# units = "mm",
# dpi=300
# )
The figure below is an annotated version of the plot above.